Text analysis: title and abstract of male and female speakers

Abstracts

# Load the presentation records from CSV.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# `as.is = TRUE` keeps text columns as character vectors.
data <- read.table("data/presentations_PPGE_2008-2019.csv", sep = ",",
                   header = TRUE, as.is = TRUE)
# Parse `date` with dmy() (day-month-year) and derive the calendar year.
data$date <- dmy(data$date)
data$year <- year(data$date)
#skimr::skim(data)

Excluding special events, such as round tables and discussions, that are not related to a project or study presented by a speaker.

# Ids of the records to exclude from the analysis.
IDs <- c(154, 250, 211, 289)
data <- filter(data, !(id %in% IDs))

Using abstracts in English (original or translated)

# Keep only the records that have an English abstract.
data <- filter(data, !is.na(abstract_english))

Number of abstracts per group

table(data$gender)
## 
##   F   M 
## 101 139
table(data$position_cat,data$gender)
##            
##              F  M
##   others     4  1
##   postdoc   21 21
##   professor 21 60
##   student   53 56

Tidytext

# Build one row per token: title and abstract are pasted into a single
# `text` field, which unnest_tokens() then splits into the `word` column.
text_tok <- data %>%
  dplyr::select(
    id, gender, position_cat, audience_n,
    abstract_english, title_english
  ) %>%
  mutate(text = paste(title_english, abstract_english)) %>%
  unnest_tokens(output = word, input = text)

# English stop words as a one-column table, used by the anti-join below.
stop_w <- tibble(word = stopwords("en"))

# Drop the stop words and sort the remaining tokens by `word`.
text <- text_tok %>%
  anti_join(stop_w, by = "word") %>%
  arrange(word)

# Remove numbers, stray symbols, one-character tokens and a few leftovers.
# NOTE(review): slice(-c(1:290)) drops the first 290 rows BY POSITION. This
# presumably relies on `text` having just been sorted by `word` and on the
# current data set having exactly 290 leading number/symbol tokens — confirm
# whenever the input file, the stop-word list or the sort locale changes.
text <- text %>% slice(-c(1:290)) %>% # numbers and some symbols (positional)
          filter(nchar(word)!=1) %>% # drop single-character tokens
          filter(!word %in% c("mpas", "ÎŽ13c", "ÎČ") )# drop specific acronyms/symbols


# Solving some simple plurals.
# Every word in `plural` becomes its singular by dropping the final "s".
# "approaches" is NOT in this list: dropping one letter gave "approache",
# so it is handled separately below.
plural <- c("actions", "advances", "adaptations", "amphibians", "animals",
            "ants", "anurans", "applications", "bees", "builds", "birds",
            "cerrados", "challenges",
            "continents", "crops",
            "decisions", "declines", "determines", "determinants", "defenses",
            "dynamics",
            "economics", "ecosystems", "environments", "experiences",
            "forests",
            "genetics", "gifts", "gradients", "guides", "impacts",
            "increases", "interactions", "lives",
            "landscapes", "males", "mammals", "mangroves", "models",
            "movements",
            "mutualisms", "networks", "neotropics",
            "opilions", "phenotypes", "plants", "projects", "paths",
            "perspectives",
            "populations", "promotes", "relationships", "relations",
            "resources", "responses", "roads", "services", "skulls", "snakes",
            "seeds",
            "spaces", "spiders", "stages", "trees", "variations",
            "threats")

# Compute the membership mask once and reuse it on both sides.
is_plural <- text$word %in% plural
text$word[is_plural] <-
  substr(text$word[is_plural], 1, nchar(text$word[is_plural]) - 1)

# Plural formed with "es": map it directly to the singular.
text$word[text$word %in% "approaches"] <- "approach"
  • Grouping similar words:
# Manual lemmatisation table: each row is c(<word as found>, <replacement>).
# Some entries on the left are deliberately misspelled (e.g. "occuring",
# "studing"), presumably to match spellings found in the abstracts.
lemma <- rbind(c("adaptive", "adaptation"),
               c("advancement", "advance"),
               c("agricultural", "agriculture"),
               c("agro", "agriculture" ),
               c("amazonia","amazon" ),
               c("amazonian","amazon" ),
               c("andean","andes"),
               c("apply","application"),
               c("applying","application"),
               c("apidae","apis"),
               c("arachnida","arachnid"),
               c("argue","argument"),
               c("basal", "basis"),
               c("behavioral","behavior"),
               c("behavioural","behavior"),
               c("bignonieae", "bignoniaceae"),
               c("biological", "biology"),
               c("brazilian","brazil"),
               c("building","build"),
               c("changing", "change"),
               c("cnidarian", "cnidaria"),
               c("coastal","coast"),
               c("colour", "color"),
               c("colors", "color"),
               c("communities","community" ),
               c("competitive", "competition"),
               c("complexity", "complex"),
               c("convergences", "convergence"),
               c("convergent", "convergence"),
               c("cordatus","cordata" ),
               c("croplands","crop"),
               c( "cultural", "culture"),
               c("darwin's", "darwin"),
               c("darwinian", "darwin"),
               c("defensive", "defense"),
               c("dependent","dependence"),
               c("detecting","detection"),
               c("determine", "determinant"),
               c("developmental", "development"),
               c("dispersers","dispersal"),
               c("disturbed", "disturbance"),
               c("diversification", "diversity"),
               c("dragonflies", "dragonfly"),
               c("drier", "drought"),
               c("ecological", "ecology"),
               c("ecologists", "ecology"),
               c("endemic", "endemism"),
               c("effectiveness", "efficiency"),
               c("environmental", "environment"),
               c("evolutionary", "evolution"),
               c("expanding", "expansion"),
               c("extinct", "extinction"),
               c("facilitate", "facilitation"),
               c("fisheries", "fishery"),
               c("floral", "flora"),
               c("floristic", "flora"),
               c("forested", "forest"),
               c("functional", "function"),
               c("functionally", "function"),
               c("functioning", "function"),
               c("geographical", "geographic"),
               c("heterogeneties", "heterogeneity"),
               c("heterogeneous", "heterogeneity"),
               c("histories", "history"),
               c("integrated", "integration"),
               c("intregating", "integration"),
               c("integrative", "integration"),
               c("invasive", "invasion"),
               c("isotopic", "isotope"),
               c("linking", "link"),
               c("living", "live"),
               c("mammalia", "mammal"),
               c("managed", "manage"),
               c("managers", "manage"),
               c("mathematical", "mathematics"),
               c("mates", "mating"),
               c("mediated", "mediate"),
               c("mechanistic", "mechanism"),
               c("matrices", "matrix"),
               c("migratory", "migration"),
               c("mimicking", "mimicry"),
               c("modeling", "model"),
               c("mutualistic", "mutualism"),
               c("natural", "nature"),
               c("neotropical", "neotropic"),
               c("northeastern", "northeast"),
               c("occuring", "occur"),
               c("onça", "onca"),
               c("opiliones", "opilion"),
               c("parasite", "parasitism"),
               c("parent", "parenting"),
               c("phylogenies", "phylogeny"),
               c("phylogenetic", "phylogeny"),
               c("phylogenomic", "phylogeny"),
               c("pollinators", "pollination"),
               c("protected", "protect"),
               c("protective", "protect"),
               c("rainfall", "rain"),
               c("reconstructing", "reconstruction"),
               c("regulatory", "regulation"),
               c("regulates", "regulation"),
               c("relation", "relationship"),
               c("reproductive", "reproduction"),
               c("restored", "restoration"),
               c("robustness", "robust"),
               c("scientific", "science"),
               c("scientist", "science"),
               c("sexy", "sexual"),
               c("simulated", "simulation"),
               c("societies", "society"),
               c("social", "society"),
               c("socio", "society"),
               c("space", "spatial"),
               c("spacio", "spatial"),
               c("stabilize", "stability"),
               c("stable", "stability"),
               c("stories", "story"),
               c("strategic", "strategy"),
               c("strategies", "strategy"),
               c("structured", "structure"),
               c("structuring", "structure"),
               c("studies", "study"),
               c("studing", "study"),
               c("sustainable", "sustainability"),
               c("theories", "theory"),
               c("theoretical", "theory"),
               c("threatened", "threat"),
               c("tropical", "tropic"),
               c("vision", "visual")
               )
# Keep both columns as character: on R < 4.0 the default would create factors,
# and assigning a factor into `text$word` would insert its integer codes.
lemma <- as.data.frame(lemma, stringsAsFactors = FALSE)

# Apply the replacements row by row, in table order.
# seq_len() is safe even if the table is empty (1:0 would iterate twice).
for (i in seq_len(nrow(lemma))) {
  text$word[text$word == lemma[i, 1]] <- lemma[i, 2]
}

WORDS - all data

table(text$gender)
## 
##     F     M 
## 10812 13614
table(text$position_cat ,text$gender)
##            
##                F    M
##   others     262  139
##   postdoc   2793 2494
##   professor 2062 5370
##   student   5524 5531

Mean number of words by abstract

# Number of retained tokens per abstract (one point per id), by gender.
ggplot(count(text, id, gender), aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ggbeeswarm::geom_quasirandom(size = 3, shape = 21)

20 palavras mais comuns

# Table of the 20 most frequent words across all abstracts.
word_counts <- count(text, word, sort = TRUE)
kable(top_n(word_counts, 20, n))
word n
species 384
ecology 188
forest 175
model 157
study 156
environment 137
evolution 134
can 132
landscape 127
population 124
diversity 112
nature 102
community 100
male 97
plant 97
different 95
patterns 88
present 88
areas 84
interaction 82

Word cloud

# Word cloud of all tokens.
textplot_wordcloud(x = dfm(tokens(text$word)))

# Word clouds by gender (F first, then M), with a 1 x 2 layout requested.
# The previous layout is saved and restored at the end so the setting does not
# leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 2))
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "F"])),
                   color = "#6D57CF")
# NOTE(review): par(new = TRUE) asks the next base plot not to start a new
# frame, which may draw the second cloud over the first instead of in the
# second panel — confirm this overlay is intended.
par(new = TRUE)
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "M"])),
                   color = "#FCA532")
par(old_par)

Word frequencies by gender

# Per-gender relative frequency of every word, in wide format:
# columns word, proportion_F, proportion_M, n_F, n_M.
# A word used by only one gender gets NA for the other gender (pivot_wider's
# default fill), so both difference columns are NA for it and arrange()
# places it last.
props <- text %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  # Drop the grouping explicitly: `gender` is consumed by the pivot below.
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(abs.dif.p = abs(proportion_F - proportion_M),
         rel.dif.p = pmax(proportion_F, proportion_M) /
           pmin(proportion_F, proportion_M)) %>%
  arrange(desc(abs.dif.p))
# Label only the 20 words with the largest absolute difference.
props$label <- NA_character_
props$label[1:20] <- props$word[1:20]
# Scatter plot of word proportions (male on x, female on y) on log-log axes.
# Points are coloured by absolute difference; the dashed line marks equal use.
# The stray empty argument (double comma) in aes() has been removed.
ggplot(props, aes(x = proportion_M, y = proportion_F,
                  color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  #geom_point(size=2.5, alpha=0.5)+
  geom_jitter(size = 2.5, alpha = 0.2) +
  geom_text_repel(aes(label = label), size = 3.2) +
  scale_x_log10(name = "Male most used words",
                labels = percent_format()) +
  scale_y_log10(name = "Female most used words",
                labels = percent_format()) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))

 # geom_smooth(method="lm")
ggsave("figures/abstract_wordFreq.jpg", height = 5, width = 7)

Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.

Legend: absolute differences in the frequency of the word by males and females. Differences above 0.3% are also indicated in text.

Correlation of word-frequency use between genders:

cor.test(props$proportion_F, props$proportion_M)
## 
##  Pearson's product-moment correlation
## 
## data:  props$proportion_F and props$proportion_M
## t = 70.789, df = 1677, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8530819 0.8771137
## sample estimates:
##       cor 
## 0.8655954

Highly correlated: this means that both genders tend to use the main words at similar frequencies.

20 words with the largest differences in frequency

# Long-format table of the 20 labelled words for the mirrored bar plot.
# Female proportions are negated so their bars extend to the left.
prop2 <- props %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, ntot, max),
         proportion_F = proportion_F * -1) %>%
  # Select the two proportion columns by name rather than by position (2:3),
  # so a change in column order cannot silently pivot the wrong columns.
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

# Mirrored bar plot: female proportions on the left, male on the right.
# The x axis is labelled with absolute values on both sides.
ggplot(prop2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  labs(x = "Proportion", y = "") +
  scale_fill_manual(
    name = "gender",
    values = c("#6D57CF", "#FCA532"),
    labels = c("F", "M")
  ) +
  geom_vline(
    xintercept = c(-0.02, -0.01, 0, 0.01, 0.02),
    linetype = "dotted",
    col = "darkgray"
  ) +
  scale_x_continuous(
    breaks = c(-0.02, -0.01, 0, 0.01, 0.02),
    labels = c(0.02, 0.01, 0, 0.01, 0.02)
  )

ggsave("figures/abstract_wordFreq_barplot.jpeg",
       units = "in", width = 7, height = 7, dpi = 300)

TF IDF

# tf-idf per word, with each gender treated as one document; highest first.
gender_word_counts <- count(text, gender, word)
text_id <- arrange(
  bind_tf_idf(gender_word_counts, word, gender, n),
  desc(tf_idf)
)

10 “exclusive” words for each group

# Bar plot of the 10 highest tf-idf words per gender.
text_id$word <- as.factor(text_id$word)
text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # tf-idf is on the x axis and the words on y; the labels used to be swapped.
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()

WORDS professors only data

# Tokens from abstracts whose position category is "professor".
textP <- filter(text, position_cat == "professor")

table(textP$gender)
## 
##    F    M 
## 2062 5370

Mean number of words by abstract

# Number of retained tokens per abstract (one point per id), by gender.
ggplot(count(textP, id, gender), aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ggbeeswarm::geom_quasirandom(size = 3, shape = 21)

20 palavras mais comuns

# Table of the 20 most frequent words in the professor subset.
word_counts_prof <- count(textP, word, sort = TRUE)
kable(top_n(word_counts_prof, 20, n))
word n
species 88
ecology 67
evolution 52
population 52
environment 50
nature 44
plant 43
model 41
study 41
can 39
ecosystem 38
diversity 35
society 32
water 32
pollination 30
research 30
interaction 29
science 29
biology 28
present 26

Counting words Frequency by gender

# Per-gender relative frequency of every word in the professor subset, in
# wide format: columns word, proportion_F, proportion_M, n_F, n_M.
# A word used by only one gender gets NA for the other gender (pivot_wider's
# default fill), so both difference columns are NA for it and arrange()
# places it last.
propsP <- textP %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  # Drop the grouping explicitly: `gender` is consumed by the pivot below.
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(abs.dif.p = abs(proportion_F - proportion_M),
         rel.dif.p = pmax(proportion_F, proportion_M) /
           pmin(proportion_F, proportion_M)) %>%
  arrange(desc(abs.dif.p))
# Label only the 20 words with the largest absolute difference.
propsP$label <- NA_character_
propsP$label[1:20] <- propsP$word[1:20]
# Scatter plot of word proportions for professors (male on x, female on y).
# Both log axes share the same limits; the dashed line marks equal use.
ggplot(
  propsP,
  aes(x = proportion_M, y = proportion_F, color = abs.dif.p)
) +
  geom_abline(color = "gray40", lty = 2) +
  # geom_point(size=2.5, alpha=0.3) +
  geom_jitter(size = 2.5, alpha = 0.3) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(
    name = "Male most used words",
    limits = c(0.0003, 0.02),
    labels = percent_format()
  ) +
  scale_y_log10(
    name = "Female Most used words",
    limits = c(0.0003, 0.02),
    labels = percent_format()
  ) +
  scale_color_gradient(
    name = "Abs Diff",
    low = "blue",
    high = "red",
    labels = percent_format()
  ) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))

# geom_smooth(method="lm")
ggsave("figures/abstract_wordFreq_Prof.jpg", height = 5, width = 7)

Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.

Legend: absolute differences in the frequency of the word by males and females.

Labels for the 20 words with largest differences in frequency.

Correlation of word-frequency use between genders:

cor.test(propsP$proportion_F, propsP$proportion_M)
## 
##  Pearson's product-moment correlation
## 
## data:  propsP$proportion_F and propsP$proportion_M
## t = 20.607, df = 559, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6072948 0.7016627
## sample estimates:
##       cor 
## 0.6570452

20 words with the largest differences in frequency

# Long-format table of the 20 labelled words (professors) for the mirrored
# bar plot. Female proportions are negated so their bars extend to the left.
propP2 <- propsP %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, ntot, max),
         proportion_F = proportion_F * -1) %>%
  # Select the two proportion columns by name rather than by position (2:3),
  # so a change in column order cannot silently pivot the wrong columns.
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

# Mirrored bar plot for professors: female proportions on the left, male on
# the right. The x axis shows absolute values on both sides; the label for
# the -0.01 break used to read "-0.01", which broke the mirrored scale.
ggplot(propP2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() + ylab("") + xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = c(-0.03, -0.02, -0.01, 0, 0.01, 0.02, 0.03),
             linetype = "dotted",
             col = "darkgray") +
  scale_x_continuous(breaks = c(-0.03, -0.02, -0.01, 0, 0.01, 0.02, 0.03),
                     labels = c(0.03, 0.02, 0.01, 0, 0.01, 0.02, 0.03))

ggsave("figures/abstract_wordFreq_barplot_Prof.jpeg", units = "in",
       width = 7, height = 7, dpi = 300)

Topic model - all data

# Document-term matrix for the topic model: one document per abstract, named
# "<id>_<gender>" so the gender can be recovered from the document name.
# select() is namespace-qualified, as in the tokenising step, so that another
# attached package's select() cannot be picked up by mistake.
matext <- text %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  dplyr::select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)

3 models with different number of topics, comparing AIC - cross validation is better? loo

# Fit LDA models with 2, 3 and 4 topics (same seed) and compare them by AIC.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4, base = TRUE)
##         AIC      dAIC     df   
## ap_lda2 377783.9      0.0 9919 
## ap_lda3 379803.9   2020.0 14878
## ap_lda4 383258.3   5474.4 19837

Word-topic probabilities

10 words with the largest probabilities for each group

# Word-topic probabilities (beta) from the 2-topic model.
ap_topics <- tidy(ap_lda2, matrix = "beta")

# The 10 highest-beta terms within each topic.
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# Horizontal bar chart of those terms, one panel per topic.
ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free") +
  coord_flip()

Document-topic probabilities - classifying the abstracts

# Document-topic probabilities (gamma) from the 2-topic model.
ap_documents <- tidy(ap_lda2, matrix = "gamma")

# Gender is the last character of the document name; each document keeps the
# topic row(s) with its highest gamma.
classifi <- ap_documents %>%
  mutate(
    gender = substr(document, nchar(document), nchar(document))
  ) %>%
  group_by(document, gender) %>%
  top_n(1, gamma)

table(classifi$gender, classifi$topic)
##    
##      1  2
##   F 55 46
##   M 62 76
# Row percentages of assigned topic by gender.
# The janitor functions are namespace-qualified because library(janitor) is
# only called further down in this document.
classifi %>%
  janitor::tabyl(gender, topic) %>%
  janitor::adorn_percentages() %>%
  janitor::adorn_pct_formatting(digits = 0)
##  gender   1   2
##       F 54% 46%
##       M 45% 55%
# Gamma of the assigned topic, by topic, with one panel per gender.
# mutate(title = reorder(title, gamma * topic)) %>%
ggplot(classifi, aes(x = as.character(topic), y = gamma)) +
  geom_boxplot() +
  facet_wrap(~gender)

Topic model Professors only

# Document-term matrix for professors only, one document per abstract, named
# "<id>_<gender>". select() is namespace-qualified, as in the tokenising
# step, so another attached package's select() cannot be picked up by mistake.
matextP <- textP %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  dplyr::select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)
# Fit LDA models with 2, 3 and 4 topics and compare them by AIC.
# NOTE: these assignments overwrite the all-data models of the same names.
ap_lda2 <- LDA(matextP, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matextP, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matextP, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4, base = TRUE)
##         AIC      dAIC     df   
## ap_lda2 113448.6      0.0 5203 
## ap_lda3 115229.2   1780.6 7804 
## ap_lda4 117829.0   4380.4 10405

word-topic probabilities

# Word-topic probabilities (beta) from the 2-topic model.
ap_topics <- tidy(ap_lda2, matrix = "beta")

# The 10 highest-beta terms within each topic.
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# Horizontal bar chart of those terms, one panel per topic.
ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free") +
  coord_flip()

Document-topic probabilities

# Document-topic probabilities (gamma) from the 2-topic model.
ap_documents <- tidy(ap_lda2, matrix = "gamma")

# Gender is the last character of the document name; each document keeps the
# topic row(s) with its highest gamma.
classifi <- ap_documents %>%
  mutate(
    gender = substr(document, nchar(document), nchar(document))
  ) %>%
  group_by(document, gender) %>%
  top_n(1, gamma)

table(classifi$gender, classifi$topic)
##    
##      1  2
##   F 11 10
##   M 36 23
library(janitor)
classifi %>% tabyl(gender, topic) %>% adorn_percentages() %>% 
  adorn_pct_formatting(digits = 0) 
##  gender   1   2
##       F 52% 48%
##       M 61% 39%
# Gamma of the assigned topic, by topic, with one panel per gender.
# The violin is drawn first and a narrow boxplot on top of it, as in the other
# violin/boxplot figures of this document; in the previous order the violin
# layer was drawn over the boxplot.
# mutate(title = reorder(title, gamma * topic)) %>%
classifi %>%
  ggplot(aes(as.character(topic), gamma)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  facet_wrap(~ gender)

Sentiment analysis

Chapter 2 of Silge & Robinson (2018).

  • The NRC lexicon categorizes words in a binary fashion (“yes”/“no”) into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
get_sentiments("nrc")
## # A tibble: 13,875 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,865 more rows
  • The Bing lexicon categorizes words in a binary fashion into positive and negative categories.
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # … with 6,776 more rows
  • The AFINN lexicon assigns words a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows

PENSAR: tem que levar em conta o número de palavras diferentes entre abstracts - principalmente se houver diferença média de número de palavras por abstract de homens e mulheres, né? Ou não?

Score words difference in female and male abstracts

All data

# AFINN lexicon (word, value).
affword <- get_sentiments("afinn")

# Word counts per abstract, restricted to words present in the lexicon.
abstract_word_counts <- count(text, id, gender, word, sort = TRUE)
affc <- inner_join(abstract_word_counts, affword, by = "word")

Calculating the mean of the scores for each abstract (weighted by the number of times the word appears) by gender:

# Mean AFINN score per abstract, plotted by gender.
# `weig.score` weights each word's score by its count within the abstract.
# `.groups = "drop"` returns an ungrouped table and silences the regrouping
# message; geom_quasirandom() is namespace-qualified as in the earlier plots.
affc %>%
  group_by(id, gender) %>%
  summarise(mean.score = mean(value),
            weig.score = weighted.mean(value, n),
            .groups = "drop") %>%
  ggplot(aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggbeeswarm::geom_quasirandom() +
  ggtitle("Mean words score per abstract and gender")

Professors

# AFINN lexicon (word, value).
affword <- get_sentiments("afinn")

# Word counts per professor abstract, restricted to words in the lexicon.
abstract_word_counts_prof <- count(textP, id, gender, word, sort = TRUE)
affc <- inner_join(abstract_word_counts_prof, affword, by = "word")

Calculating the mean of the scores for each abstract (weighted by the number of times the word appears) by gender:

# Mean AFINN score per abstract, plotted by gender.
# `weig.score` weights each word's score by its count within the abstract.
# `.groups = "drop"` returns an ungrouped table and silences the regrouping
# message; geom_quasirandom() is namespace-qualified as in the earlier plots.
affc %>%
  group_by(id, gender) %>%
  summarise(mean.score = mean(value),
            weig.score = weighted.mean(value, n),
            .groups = "drop") %>%
  ggplot(aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggbeeswarm::geom_quasirandom() +
  ggtitle("Mean words score per abstract and gender")

Frequency of sentiment words per abstract

As classificações das palavras não me parecem muito acuradas com a linguagem científica.

Precisa saber como ponderar pelo total de palavras.

All data

# NRC lexicon (word, sentiment); a word can carry several sentiments.
nrcword <- get_sentiments("nrc")

# Number of word occurrences per abstract and NRC sentiment category.
# `.groups = "drop"` returns an ungrouped table and silences the regrouping
# message.
nrc <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n), .groups = "drop")

# One panel per sentiment; geom_quasirandom() is namespace-qualified as in the
# earlier plots.
ggplot(nrc, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin() +
  ggbeeswarm::geom_quasirandom()

Professors

# NRC lexicon (word, sentiment); a word can carry several sentiments.
nrcword <- get_sentiments("nrc")

# Number of word occurrences per professor abstract and NRC sentiment.
# `.groups = "drop"` returns an ungrouped table and silences the regrouping
# message.
nrc <- textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n), .groups = "drop")

# One panel per sentiment; geom_quasirandom() is namespace-qualified as in the
# earlier plots.
ggplot(nrc, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin() +
  ggbeeswarm::geom_quasirandom()

# Positive words only.
nrc %>%
  filter(sentiment == "positive") %>%
  ggplot(aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ggbeeswarm::geom_quasirandom() +
  ggtitle("Positive words")

Frequency of sentiment words per abstract

All data

# Bing lexicon (word, sentiment).
bingword <- get_sentiments("bing")

# Number of word occurrences per abstract and Bing sentiment.
# `.groups = "drop"` returns an ungrouped table and silences the regrouping
# message.
bing <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n), .groups = "drop")

# One panel per gender; geom_quasirandom() is namespace-qualified as in the
# earlier plots.
ggplot(bing, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin() +
  ggbeeswarm::geom_quasirandom()

Professors

# Bing lexicon (word, sentiment).
bingword <- get_sentiments("bing")

# Number of word occurrences per professor abstract and Bing sentiment.
# `.groups = "drop"` returns an ungrouped table and silences the regrouping
# message.
bing <- textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n), .groups = "drop")

# One panel per gender; geom_quasirandom() is namespace-qualified as in the
# earlier plots.
ggplot(bing, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ggbeeswarm::geom_quasirandom()